## This script reads the combined HILDA Stata (.dta) file for each wave, strips the wave-letter prefix so column names are consistent across waves, and saves each wave in .qs format.


# Preliminaries -----------------------------------------------------------

## Start from an empty workspace and run a garbage collection before the
## wave files are read.
rm(list = ls())
gc()

## Request a larger memory limit (size is in Mb per the R documentation).
## memory.limit() is Windows-only and from R 4.2.0 is a stub that only emits a
## warning, so it is called only where it still has an effect.
if (.Platform$OS.type == "windows" && getRversion() < "4.2.0") {
  memory.limit(size = 25000)
}


# Master package loading
## The pipe, stringr, tibble, dplyr, haven and qs functions used below are not
## attached in this file - presumably this script attaches them; verify there.
source("./R scripts/Master package loading.R", encoding = "ISO-8859-1")


## hilda file paths
hilda_dta_path <- "./HILDA Wave 19/2. STATA 190/" ## Folder where input Stata .dta files are saved
hilda_qs_path <- "./HILDA Wave 19/qs files/" ## Folder where output .qs files should be saved


## Base names (extension removed) of the combined dta files in the input folder.
## Only names of the form "Combined_<something>.dta" are kept, so other files
## sharing the "Combined_" prefix are not counted as waves. The dot is escaped
## and the extension anchored to the end of the name so that only a literal
## trailing ".dta" is matched and removed.
## The position of each name in this vector is used further down as the wave
## number, so the listing order matters.
combined_files <- list.files(hilda_dta_path, pattern = "^Combined_.+\\.dta$") %>%
  str_remove("\\.dta$")


# Read and save HILDA data with generic column names --------------------------------------------------------------

## dataframe for matching year / wave ID / wave letter in HILDA
## One row per entry of combined_files, in the same order: row i gets id i,
## the i-th lowercase letter and year 2000 + i.
## Wave letters are taken from `letters`, so more than 26 files cannot be
## labelled; fail early instead of silently producing NA letters.
stopifnot(length(combined_files) <= length(letters))

## seq_along() gives an empty table when no files were found, whereas
## 1:length(x) would yield c(1, 0) and build a spurious two-row table.
## 2000L keeps `year` an integer column.
waves <- tibble(letter = letters[seq_along(combined_files)],
                year = 2000L + seq_along(combined_files)
                ) %>%
  rowid_to_column("id")

## read combined dta files for each wave and save as qs file
## For each position n in combined_files:
##   1. read "<hilda_dta_path><name>.dta",
##   2. drop the wave letter (waves$letter[n]) from the start of every column
##      name that begins with it,
##   3. add a `wavenumber` column equal to n,
##   4. write the result to "<hilda_qs_path><name>.qs".
## seq_along() makes this a no-op when no files were found (1:length(x) would
## iterate over c(1, 0) and try to read "NA.dta"). The work is done purely for
## its side effects, so the list returned by lapply() is wrapped in invisible()
## to avoid auto-printing it. The body stays inside a function so each wave's
## data is released once it has been saved.
invisible(lapply(seq_along(combined_files),
  function(n) {
    print(n) ## print which wave the function is up to

    ## regex matching this wave's letter at the start of a column name
    wave_prefix <- paste0("^", waves$letter[n])

    ## read in whole data file
    read_dta(paste0(hilda_dta_path, combined_files[n], ".dta")) %>%
      ## remove wave prefix on column names if it is a wave identifier (probably all vars except some start with x)
      ## NOTE(review): any column that merely begins with this letter loses it
      ## as well - confirm no cross-wave variable starts with a wave letter.
      setNames(., str_remove(names(.), wave_prefix)) %>%
      ## add new column with wave number
      mutate(wavenumber = n) %>%
      ## save as qs file
      qsave(., paste0(hilda_qs_path, combined_files[n], ".qs"))
  }
))